Agenda

Announcements

  • MDSR Ch 10 programming notebook assigned

MDSR Ch 10 Errata / Tips

  • Some sections don’t require programming, but please still include the headers for navigation purposes
  • p. 234-235: Rename the result for 20,000 simulations as sim_results20k
    • the authors reuse the object name sim_results for the example with 20,000 simulations, but this overwrites the object expected for the plot at the top of p. 235.
    • by renaming the object resulting form 20,000 simulations, the ggplot call will still be able to access the intended object

Purposes for Simulation

Intro to Simulation

Monte Carlo simulation

Light at Night

# data intake
NightLightRaw <- read_csv("LightatNight.csv", col_names = TRUE)
Parsed with column specification:
cols(
  Light = col_character(),
  BodyMass0 = col_double(),
  BodyMass8 = col_double(),
  BMChange = col_double(),
  Corticosterone = col_double(),
  DayPct = col_double(),
  Consumption = col_double(),
  GlucoseInt = col_character(),
  GTT15 = col_double(),
  GTT120 = col_double(),
  Activity = col_double()
)
# subsetting for comparison of interest
NightLight <- 
  NightLightRaw %>%
  filter(Light %in% c("bright", "dark")) %>%
  select(Light, BMChange)
# summary statistics for each group
favstats(BMChange ~ Light, data = NightLight)
ObsMean <- mean(BMChange ~ Light, data = NightLight, na.rm = TRUE)
ObsMeanDiff <- ObsMean["bright"] - ObsMean["dark"]

Light at Night Simulation

# simulate with mosaic::do()
NightLightSims <- 
  mosaic::do(1000) * 
    NightLight %>%
    mutate(Light = shuffle(Light)) %>%
    group_by(Light) %>%
    summarise(meanBMChange = mean(BMChange, na.rm = TRUE)) 

# results after `shuffle( )`
favstats(meanBMChange ~ Light, data = NightLightSims)

# mean differences for shuffled data
LightSimResults <- 
  NightLightSims %>%
  select(-.row) %>%
  spread(key = Light, value = meanBMChange) %>%
  mutate(meanDiff = bright - dark) 


# distribution of outcomes expected by chance (random simulations)
p <- 
  LightSimResults %>%
  ggplot(aes(x = meanDiff)) + 
  geom_density() 
p

# comparison with observed outcome
p + geom_vline(xintercept = ObsMeanDiff)

Back to NCI60

NCI60 data reduction

NCI60 data reduction

NCI60 <- read_csv("NCI60.csv") 
Parsed with column specification:
cols(
  .default = col_double(),
  Probe = col_character()
)
See spec(...) for full column specifications.
head(NCI60)
Spreads <- 
  NCI60 %>%
  gather(value = expression, key = cellLine, -Probe) %>%
  group_by(Probe) %>%
  summarise(N = n(), 
            spread = sd(expression)) %>%
  arrange(desc(spread)) %>%
  mutate(order = row_number())

Simulations for NCI60 data reduction

Sim_spreads <- 
  NCI60 %>%
  gather(value = expression, key = cellLine, -Probe) %>%
  mutate(Probe = shuffle(Probe)) %>%
  group_by(Probe) %>%
  summarise(N = n(), 
            spread = sd(expression)) %>%
  mutate(order = row_number())
threshold <- max(Sim_spreads$spread)
Spreads %>%
  filter(order <= 500) %>%
  ggplot(aes(x = order, y = spread)) + 
  ylim(limits = c(2.5, 7.5)) + 
  geom_line(size = 1) + 
  geom_hline(yintercept = threshold, color = "red", size = 1, linetype = 2) + 
  annotate("text", x = 400, y = threshold + 0.2, label = "Largest random outcome")

# head(Spreads, 10)

Randomness is the key to simulation

image credit: https://m.xkcd.com/1210/

image credit: https://m.xkcd.com/1210/

Random number generators

note about uniform[0, 1]

Brush with destiny

Simulation

n <- 10000
sim_meet <- data.frame(
  you <- runif(n, min = 0, max = 60), 
  destiny <- runif(n, min = 0, max = 60)) %>%
  mutate(result = ifelse(abs(you - destiny) <= 10, 
                         "Same time, same place!", "So close, and yet so far..."))
tally(~ result, format = "percent", data = sim_meet)
result
     Same time, same place! So close, and yet so far... 
                      30.37                       69.63 
destinyModel <- binom.test(~ result, n, success = "Same time, same place!", data = sim_meet)
confint(destinyModel)
sim_meet %>%
  ggplot() + 
  geom_point(aes(x = destiny, y = you, color = result), alpha = 0.3) + 
  ylab("You arrive (minutes after 10am)") + 
  xlab("Destiny arrives (minutes after 10am)") + 
  ggtitle("(simulated) Probability you'll meet your destiny at Starbucks")

NA

Key principles in simulation

Key principles: Design

Brush with destiny design

  • Starbucks arrival time is uniform between 10am & 11am (60 minute window) for both
  • if you’re both there at the same time, you’ll definitely meet
  • Q: how might you challenge these assumptions?

Key principles: Modularity

starbucks_sim <- function(num_sim = 1000, wait = 10) { 
  # purpose: simulate chance meeting between two people 
  ### num_sim: number of times to replicate simulation
  ### wait: number of minutes between arrivals for successful meeting
sim_meet <- data.frame(
  you <- runif(n, min = 0, max = 60), 
  destiny <- runif(n, min = 0, max = 60)) %>%
  mutate(result = ifelse(abs(you - destiny) <= 10, 
                         "Same time, same place!", "So close, and yet so far..."))
destinyModel <- binom.test(~ result, n, success = "Same time, same place!", data = sim_meet)
return(confint(destinyModel))
}
starbucks_sim()

Key principles: Reproducibility

starbucks_sim()
# choose seed
set.seed(23)  # set initial state of RNG seed
runif(n = 5)  # new outcome
[1] 0.5766 0.2231 0.3319 0.7107 0.8194
runif(n = 5)  # new outcome
[1] 0.4237 0.9635 0.9781 0.8405 0.9966
# this is why we say *psuedo* random
set.seed(23)  # set initial state of RNG seed
runif(n = 5)  # back to outcome 1 again
[1] 0.5766 0.2231 0.3319 0.7107 0.8194
# different seed
set.seed(301) # new seed
runif(n = 5)  # new outcome
[1] 0.597106 0.132231 0.002137 0.753970 0.614147

Key principles: Replication

simple_sim <- function(num_sim = 1000, wait = 10) { 
  # purpose: lean version of `starbucks_sim` to explore replication  
  you <- runif(num_sim, min = 0, max = 60)
  destiny <- runif(num_sim, min = 0, max = 60)
  return(sum(abs(you - destiny) <= wait) / num_sim)
}
reps <- 2500
params <- data.frame(num_sims = c(100, 400, 1600))
sim_results <- 
  params %>%
  group_by(num_sims) %>%
  dplyr::do(mosaic::do(reps) * simple_sim(num_sim = .$num_sims, wait = 10))

|=========================================================                             | 67% ~1 s remaining     
|======================================================================================|100% ~0 s remaining     
favstats(simple_sim ~ num_sims, data = sim_results)
sim_results %>%
  ggplot(aes(x = simple_sim, color = factor(num_sims))) + 
  geom_density(size = 2) + 
  scale_x_continuous("Proportion of times you meet")

Modifying the simulation of our Brush with Destiny

Modifying the simulation of our Brush with Destiny

Assumptions modified

  1. your arrival time is 5 minutes + random delays
    • your minimum possible time is 5 minutes
    • random delays: rexp(n, .1)
      • typically 10 minutes or less
      • possibly longer
      • never negative
  2. other person arrival time is unknown + random delays
    • fastest arrival is unknown, we assume uniform between 3 & 20 minutes
    • random delays: rexp(n, .1)
  3. maybe 5% chance you’re both willing to strike up a meaningful conversation with a stranger at Starbucks on any given day
  4. Q: Share some additional things we still haven’t considered here!
  • Note about modeling waiting time (i.e., delay)
    • Lots of “name-brand” probability distributions model waiting time scenarios
    • don’t get stuck on the details of that choice here, just pay attention in STAT 414!
    • our choice: ggplot() + geom_density(aes(x = rexp(n, .1)))
n <- 100000
sim_meetExp <- data.frame(
  you <- 5 + rexp(n, 0.1),                      # your arrival time (5 min + random delay)
  destiny <- runif(n, min = 3, max = 20) +      # fastest arrival is unknown between 3 & 20 min
             rexp(n, 0.1)) %>%                  # random delay
  mutate(opportunity = ifelse(abs(you - destiny) <= 10, 
                         "Same time, same place!", "So close, and yet so far..."), 
         talkativeMood = sample(c(TRUE, FALSE), size = n,    
                                replace = TRUE, prob = c(0.05, 0.95)),
         result = ifelse(test = (opportunity == "Same time, same place!" & talkativeMood == TRUE), 
                         yes = "Bliss", no = "Oh well..."))
tally(~ result, format = "percent", data = sim_meetExp)
result
     Bliss Oh well... 
     2.512     97.488 
destinyModel <- binom.test(~ result, n, success = "Bliss", data = sim_meetExp)
confint(destinyModel)
sim_meetExp %>%
  ggplot() + 
  geom_point(aes(x = destiny, y = you, color = result), alpha = 0.3) + 
  ylab("You arrive (minutes after 10am)") + 
  xlab("Destiny arrives (minutes after 10am)") + 
  ggtitle("This is why it's so hard to meet people...") + 
  scale_x_continuous(limits = c(0, 60)) + 
  scale_y_continuous(limits = c(0, 60)) 

NA

Simulating Models

Simulating simple linear regression data

Consider a simple linear regression model:

\[y = \beta_0 + \beta_1*x\]

Simulating simple linear regression data

Consider a logistic regression model:

\[E[Y | X] = \beta_0 + \beta_1*X\]

\[y_i = b_0 + b_1*x_i + \epsilon_i\]

# beta_0 
intercept <- 26
# beta_1
beta <- -2.5
# sample size for our simulated model
n <- 5000
# X is a simulated random variable representing some measurable characteristic of the cases 
xtest <- rnorm(n, mean = 10, sd = 1)
# simulate errors: assume i.i.d. N(0, sigma)
errors <- rnorm(n, mean = 0, sd = 5)
# Y is a simulated random variable representing our response
ytest <- intercept + (xtest * beta) + errors
# fit the simulated logistic regression model
simreg <- lm(ytest ~ xtest)
# how did we do?
msummary(simreg)
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  25.0274     0.7180    34.9   <2e-16 ***
xtest        -2.4117     0.0712   -33.9   <2e-16 ***

Residual standard error: 5.01 on 4998 degrees of freedom
Multiple R-squared:  0.187, Adjusted R-squared:  0.186 
F-statistic: 1.15e+03 on 1 and 4998 DF,  p-value: <2e-16
# confint(simreg)

Simulating logistic regression data

Consider a logistic regression model:

\[log(\frac{\pi}{1-\pi}) = \beta_0 + \beta_1*x\]

Simulating logistic regression data

Consider a logistic regression model:

\[log(\frac{\pi}{1-\pi}) = \beta_0 + \beta_1*x\]

\[log(\frac{p_i}{1-p_i}) = b_0 + b_1*x_i\]

# beta_0
intercept <- -1
# beta_1
beta <- 0.5
# sample size for our simulated model
n <- 5000
# X is a simulated random variable representing some measurable characteristic of the cases 
xtest <- rnorm(n, mean = 1, sd = 1)
# recall: the log-odds is a *linear* model with respect to our explanatory/predictor variables
linpred <- intercept + (xtest * beta)
# transform predictions to probabilities
prob <- exp(linpred)/(1 + exp(linpred))
# Y is a simulated random variable representing our response; the "errors" are built-in here
ytest <- ifelse(runif(n) < prob, 1, 0)
# fit the simulated logistic regression model
logreg <- glm(ytest ~ xtest, family=binomial)
# how did we do?
coef(logreg)
(Intercept)       xtest 
    -1.0136      0.5062 
confint(logreg)
Waiting for profiling to be done...
              2.5 %  97.5 %
(Intercept) -1.1048 -0.9238
xtest        0.4444  0.5687
## interpret odds
exp(coef(logreg))
(Intercept)       xtest 
     0.3629      1.6590 
exp(confint(logreg))
Waiting for profiling to be done...
             2.5 % 97.5 %
(Intercept) 0.3313  0.397
xtest       1.5596  1.766

Evaluating Model Assumptions

Evaluating Simple Linear Regression Assumptions

Challenging the equal variance assumption

\[E[Y|X] = \beta_0 + \beta_1X_1 + \beta_2X_2\]

\[y_i = b_0 + b_1x_{1i} + b_2x_{2i} + \epsilon_i\]

# sample size for our simulation
n <- 250
# sd of model errors
rmse <- 1
# two explanatory variables
x1 <- runif(n, min = 0, max = 15)
# model coefficients
beta0 <- -3
beta1 <- 0.5
# simulate response data
y <- beta0 + beta1*x1 + rnorm(n, mean = 0, sd = rmse)        # equal variance
# y <- beta0 + beta1*x1 + rnorm(n, mean = 0, sd = rmse + x1)   # UNequal variance
# plot data with smoother
data.frame(y, x1) %>%
  ggplot(aes(x = x1, y = y)) + 
  geom_point() +
  geom_smooth()

# regression model
simLM <- lm(y ~ x1)
# check the model fit
msummary(simLM)
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -3.0248     0.1320   -22.9   <2e-16 ***
x1            0.4986     0.0149    33.4   <2e-16 ***

Residual standard error: 0.983 on 248 degrees of freedom
Multiple R-squared:  0.818, Adjusted R-squared:  0.817 
F-statistic: 1.11e+03 on 1 and 248 DF,  p-value: <2e-16
mplot(simLM)[1]  # residual diagnostics
[[1]]

Simulate many models

dosim <- function() {
  y <- beta0 + beta1*x1 + rnorm(n, mean = 0, sd = rmse + x1)   # departure
  mod <- lm(y ~ x1)
  result <- coefficients(mod)
  return(result)
}
sims <- mosaic::do(1000) * dosim()
favstats(~ x1, data = sims)
# distribution coefficient estimates
sims %>%
  ggplot(aes(x = x1)) +
  geom_density() +
  # pass arguments to `dnorm` function
  stat_function(fun = dnorm, args = list(mean = mean(sims$x1), sd = sd(sims$x1)),
                linetype = 2) +
  ggtitle("distribution of regression parameter") +
  scale_x_continuous("beta 1 coefficients")

Simulating a complex system

any_active <- function(df) {
  # return TRUE if someone has not finished
  return(max(df$endtime) == Inf)
}
next_customer <- function(df) {
  # returns the next customer in line
  res <- filter(df, endtime == Inf) %>%
    arrange(arrival)
  return(head(res, 1))
}
update_customer <- function(df, cust_num, end_time) {
  # sets the end time of a specific customer
  return(mutate(df, endtime = ifelse(custnum == cust_num, end_time, endtime)))
}
  
run_sim <- function(n = 1/2, m = 3/2, hours = 6) {
  # simulation of bank where there is just one teller
  # n: expected number of customers per minute
  # m: expected length of transaction is m minutes
  # hours: bank open for this many hours
  
  customers <- rpois(hours * 60, lambda = n)
  arrival <- numeric(sum(customers))
  position <- 1
  for (i in 1:length(customers)) {
    numcust <- customers[i]
    if (numcust != 0) {
      arrival[position:(position + numcust - 1)] <- rep(i, numcust)
      position <- position + numcust
    }
  }
  duration <- rexp(length(arrival), rate = 1/m)  # E[X]=m
  df <- data.frame(arrival, duration, custnum = 1:length(duration), 
                   endtime = Inf, stringsAsFactors = FALSE)
  
  endtime <- 0 # set up beginning of simulation
  while (any_active(df)) { # anyone left to serve
    next_one <- next_customer(df)
    now <- ifelse(next_one$arrival >= endtime, next_one$arrival, endtime)
    endtime <- now + next_one$duration
    df <- update_customer(df, next_one$custnum, endtime)
  }
  df <- mutate(df, totaltime = endtime - arrival)
  return(favstats(~ totaltime, data = df))
}
sim_results <- mosaic::do(3) * run_sim()
sim_results
---
title: "Simulation"
subtitle: "MDSR Ch 10"
output: 
  slidy_presentation: default
  html_notebook: default  
---


```{r Front Matter, echo=TRUE, message=FALSE, warning=FALSE, include=FALSE}
# clean up R environment
rm(list = ls())

# global options
knitr::opts_chunk$set(eval=TRUE, include=TRUE)
options(digits=4)

# packages used
library(mdsr)
library(tidyverse)

# inputs summary


```


## Agenda


#### Announcements

- MDSR Ch 10 programming notebook assigned

#### MDSR Ch 10 Errata / Tips

- Some sections don't require programming, but please still include the headers for navigation purposes
- p. 234-235: Rename the result for 20,000 simulations as `sim_results20k`
    - the authors reuse the object name `sim_results` for the example with 20,000 simulations, but this overwrites the object expected for the plot at the top of p. 235.  
    - by renaming the object resulting form 20,000 simulations, the ggplot call will still be able to access the intended object


## Purposes for Simulation

- Q: What comes to mind when you think of "simulation"
- Q: What are the purposes for simulation?


## Intro to Simulation

- Simulations are often useful to learn about the nature of a random process
- We build a realistic "model" of the system/process that we want to study
    - The act of building itself often leads to deeper understanding
    - We study outcomes produced by our model to build intuition about the nature of the system and it's outcomes
- **Winnowing out hypotheses**: 
    - we often have many competing hypotheses about how a real system works
    - we can simulate/generate data from each hypothesis and rule out the ones that aren't consistent with the observed data
    - e.g., our randomization test
- **Conditional inference**: 
    - build a system that reflects how a real system works
    - use the data that it produces to build intuition about the real world

## Monte Carlo simulation

- You'll hear the term "Monte Carlo" thrown around when people talk about simulation
- Monte Carlo simulation template 
    - define some domain of possible inputs
    - use a probability distribution to generate inputs over the domain
    - perform some deterministic computation on the inputs
    - aggregate the results
- Monte Carlo simulations rely on the **Law of Large Numbers** 
    - the average of a large sample ought to be close to the expected value 
    - the larger the sample, the closer it tends to be


## Light at Night

![](LightatNight.png)

- Researchers interested in the "link between the molecular circadian clock & metabolism"
- The study data shows that body mass of mice in "bright" group increased by about 3 grams more, on average, than mice in "dark" group
    - Q: is this a meaningful change, or just randomness?
    - Q: If it were all just randomness, how could we simulate outcomes that might be observed by chance alone?
        1. **Without** a computer?
        2. With a computer?

```{r}
# data intake
NightLightRaw <- read_csv("LightatNight.csv", col_names = TRUE)

# subsetting for comparison of interest
NightLight <- 
  NightLightRaw %>%
  filter(Light %in% c("bright", "dark")) %>%
  select(Light, BMChange)

# summary statistics for each group
favstats(BMChange ~ Light, data = NightLight)

ObsMean <- mean(BMChange ~ Light, data = NightLight, na.rm = TRUE)
ObsMeanDiff <- ObsMean["bright"] - ObsMean["dark"]
```

## Light at Night Simulation

```{r eval=FALSE, include=TRUE}
# simulate with mosaic::do()
NightLightSims <- 
  mosaic::do(1000) * 
    NightLight %>%
    mutate(Light = shuffle(Light)) %>%
    group_by(Light) %>%
    summarise(meanBMChange = mean(BMChange, na.rm = TRUE)) 

# results after `shuffle( )`
favstats(meanBMChange ~ Light, data = NightLightSims)

# mean differences for shuffled data
LightSimResults <- 
  NightLightSims %>%
  select(-.row) %>%
  spread(key = Light, value = meanBMChange) %>%
  mutate(meanDiff = bright - dark) 


# distribution of outcomes expected by chance (random simulations)
p <- 
  LightSimResults %>%
  ggplot(aes(x = meanDiff)) + 
  geom_density() 
p

# comparison with observed outcome
p + geom_vline(xintercept = ObsMeanDiff)
  
```



## Back to NCI60

- Cancer types are often named for the location where they're found
    - lung, 
    - ovarian, 
    - breast, 
    - etc
- **Problem: tissue of origin may not be the best indicator for appropriate treatment**
- Previously, we used unsupervised learning methods to explore relationships among various cancer cell lines 
    - Good idea, but it's a challenging problem
    - It's hard to say if we found much of value
- all cells have a genome with lots of different genes that the behavior
    - microarrays are used to examine the expression of individual genes within a cell
        - each microarray has many (dozens; hundreds; thousands) probes that provide a snapshot of gene activity
        - comparisons among cells in different states can reveal clues about which genes are linked to different aspects of cell activity
- Problem:
    - most genes regulate **mundane** behaviors typical of all cells... we don't care about these
    - some genes regulate **problem** behaviors related to cancer cells (e.g., over-rapid reproduction)
    - **we want to separate the vital few genes from the trivial many...**
    - Q: Any good ideas?

## NCI60 data reduction

- Goal: we want to separate the vital few genes from the trivial many 
    - most genes regulate **mundane** behaviors typical of all cells... we don't care about these
    - some genes regulate **problem** behaviors related to cancer cells (e.g., over-rapid reproduction)
- Simple Idea: 
    - if expression of a probe is the same across all available cancer samples, it isn't to differentiate them!
    - Q: How can we tell if a probe is the same across all available cancer samples?
    - Q: How will we separate the vital few from the trivial many? 

## NCI60 data reduction

- for now, we start with transposed NCI60 data 
    - probes are the cases
    - cell lines are the variables
    - this is the opposite of the form we used for our cluster analysis
- Goal: we want to calculate the SD for each probe among all cell lines
    - High variability across cell lines suggests the probe might be useful to differentiate among cancer types
    - Q: How will we know how high is high enough?
    - Q: How could you (hypothetically) solve this without a computer?


```{r}
NCI60 <- read_csv("NCI60.csv") 
head(NCI60)

Spreads <- 
  NCI60 %>%
  gather(value = expression, key = cellLine, -Probe) %>%
  group_by(Probe) %>%
  summarise(N = n(), 
            spread = sd(expression)) %>%
  arrange(desc(spread)) %>%
  mutate(order = row_number())
```

<!-- Day2 -->

## Simulations for NCI60 data reduction

- We want to simulate a null distribution such that:
    - we break any association between probe labels and gene expression measurements
    - All that remains is random variability
    - we then compare observed results with the random variability
- Q: How does the code shown accomplish this task?
- Q: What conclusions can you draw as a result?
- Q: Describe similarities & differences between this example & the process used in "Light at Night"?

```{r}
Sim_spreads <- 
  NCI60 %>%
  gather(value = expression, key = cellLine, -Probe) %>%
  mutate(Probe = shuffle(Probe)) %>%
  group_by(Probe) %>%
  summarise(N = n(), 
            spread = sd(expression)) %>%
  mutate(order = row_number())

threshold <- max(Sim_spreads$spread)

Spreads %>%
  filter(order <= 500) %>%
  ggplot(aes(x = order, y = spread)) + 
  ylim(limits = c(2.5, 7.5)) + 
  geom_line(size = 1) + 
  geom_hline(yintercept = threshold, color = "red", size = 1, linetype = 2) + 
  annotate("text", x = 400, y = threshold + 0.2, label = "Largest random outcome")

# head(Spreads, 10)

```

<!-- Day3 -->

## Randomness is the key to simulation

- random number generation is the workhorse of simulation
- Let's TRY IT!
    1. write down a series of twenty 0's and 1's in random order 
        - as random as you can
        - just use your brain, nothing else
    2. after you finish, let R try: `resample(0:1, 20)`
    3. Carefully inspect the results you and your neighbor created... is there anything surprising?
- Q: What are some other ways to generate truly random outcomes without a computer
    - Binary outcome (T/F)
    - Random assignment into three groups (equal in size)
    - Random assignment into three groups (NOT equal in size)
    - Random 10-digit number
    - Sample with replacement from a set of 43 measurements (e.g., each is the weight of an object)

![image credit: https://m.xkcd.com/1210/](random_xkcd1210.png)


## Random number generators

- random isn't haphazard... random has "structure"
- random number generators are the workhorse of simulation
    - hardware (true) random number generators
    - pseudo random number generators 
- surprisingly challenging to manufacture randomness!
    - 1996: Diehard tests
        - battery of 15 tests
        - <https://en.wikipedia.org/wiki/Diehard_tests>
    - 2007: TestU01 
        - Small Crush (10 tests)
        - Crush (96 tests)
        - Big crush (160 tests)


## Popular randomizing functions available in R

- **`runif( )`**
- `sample( )`
- `shuffle( )`
- `resample( )`
- name-brand probability distributions
    - `rnorm( )`
    - `rbinom( )`
    - etc...

```{r}
sample(c("red", "blue", TRUE, 2.71), size = 1)
sample(1:12, size = 1)
sample(letters, size = 1)
sample(LETTERS, size = 1)

```


## note about uniform[0, 1] 

- generating uniform random numbers are super important 
- any random scheme you want to create can almost certainly built up from random uniform
    - Q: coin flips?
    - Q: randomly select 20% of some set?
    - Q: sample from a Normal distribution?



## Brush with destiny

- Ever wonder how chance encounters might shape the rest of your life?
- There is someone at Penn State who is the very closest personal match for you among those you've never met
    - soulmate
    - greatest friend you could ever know
    - potential spouse
    - ideal business partner 
    - whatever floats your boat (and theirs, apparently!)
- You don't know this person, but you both always buy coffee at the HUB Bookstore between classes on MWF
    - You both have a class break from about 10 to 11am
    - You both swing by Starbucks sometime during that break
- Goal: What's the chance that the two of you meet?
    - Q: How can you simulate this scenario?  



## Simulation

```{r}
n <- 10000
sim_meet <- data.frame(
  you <- runif(n, min = 0, max = 60), 
  destiny <- runif(n, min = 0, max = 60)) %>%
  mutate(result = ifelse(abs(you - destiny) <= 10, 
                         "Same time, same place!", "So close, and yet so far..."))

tally(~ result, format = "percent", data = sim_meet)

destinyModel <- binom.test(~ result, n, success = "Same time, same place!", data = sim_meet)
confint(destinyModel)
```


```{r}
sim_meet %>%
  ggplot() + 
  geom_point(aes(x = destiny, y = you, color = result), alpha = 0.3) + 
  ylab("You arrive (minutes after 10am)") + 
  xlab("Destiny arrives (minutes after 10am)") + 
  ggtitle("(simulated) Probability you'll meet your destiny at Starbucks")
  
```

<!-- Day4 -->

## Key principles in simulation

- design
- modularity
- replication



## Key principles: Design 

- start simple 
- add complexity gradually as you build up toward a more realistic simulation

#### Brush with destiny design

- Starbucks arrival time is **uniform** between 10am & 11am (60 minute window) for both
- if you're both there at the same time, you'll definitely meet
- Q: how might you challenge these assumptions?


## Key principles: Modularity

- often helpful to wrap up the simulation as a function
    - can then call it repeatedly (without copy/paste)
    - modify options and parameters as arguments to your function
- it pays to plan what features your simulation will need & consider splitting them off as different functions
    - easy to reuse those functions in other simulations
    - often easier to read the code

```{r}
starbucks_sim <- function(num_sim = 1000, wait = 10) { 
  # purpose: simulate chance meeting between two people 
  ### num_sim: number of times to replicate simulation
  ### wait: number of minutes between arrivals for successful meeting
sim_meet <- data.frame(
  you <- runif(n, min = 0, max = 60), 
  destiny <- runif(n, min = 0, max = 60)) %>%
  mutate(result = ifelse(abs(you - destiny) <= 10, 
                         "Same time, same place!", "So close, and yet so far..."))

destinyModel <- binom.test(~ result, n, success = "Same time, same place!", data = sim_meet)
return(confint(destinyModel))
}

starbucks_sim()

```


## Key principles: Reproducibility

```{r}
starbucks_sim()
```

- uh oh... that's not the same answer we had last time!
- reproducibility of simulations
    - R functions like `runif( )` are called *psuedo* random because it's deterministic if the initial state is known
    - the initial state is called the "seed"
    - `set.seed( )` lets the user define the initial state in R
    - otherwise, R uses the system clock to choose a seed each time you run the code

```{r}
# choose seed
set.seed(23)  # set initial state of RNG seed
runif(n = 5)  # new outcome
runif(n = 5)  # new outcome

# this is why we say *psuedo* random
set.seed(23)  # set initial state of RNG seed
runif(n = 5)  # back to outcome 1 again

# different seed
set.seed(301) # new seed
runif(n = 5)  # new outcome

```


## Key principles: Replication

- how many times do we want to replicate this simulation? 
- results will be sensitive to the number of computations that we're willing to execute
- So, how many?  It's a balance:
    - more simulated replications leads to greater precision of our estimates
    - unnecessary calculations waste time and computing resources
- Q: compare the results shown
    - what is `num_sims` vs `n`?
    - what should `mean` & `sd` mean to us?


```{r}
simple_sim <- function(num_sim = 1000, wait = 10) { 
  # purpose: lean version of `starbucks_sim` to explore replication  
  you <- runif(num_sim, min = 0, max = 60)
  destiny <- runif(num_sim, min = 0, max = 60)
  return(sum(abs(you - destiny) <= wait) / num_sim)
}

reps <- 2500
params <- data.frame(num_sims = c(100, 400, 1600))
sim_results <- 
  params %>%
  group_by(num_sims) %>%
  dplyr::do(mosaic::do(reps) * simple_sim(num_sim = .$num_sims, wait = 10))
favstats(simple_sim ~ num_sims, data = sim_results)
```

```{r}
sim_results %>%
  ggplot(aes(x = simple_sim, color = factor(num_sims))) + 
  geom_density(size = 2) + 
  scale_x_continuous("Proportion of times you meet")
```




## Modifying the simulation of our Brush with Destiny

- Q: what's the average time it takes to arrive at Starbucks after class?
- Q: what's the average time it takes the other person???
- Q: By the way, what's the probability you actually talk to this person if they are even there??

## Modifying the simulation of our Brush with Destiny

#### Assumptions modified

1. your arrival time is 5 minutes + random delays 
    - your minimum possible time is 5 minutes
    - random delays: `rexp(n, .1)` 
        - typically 10 minutes or less
        - possibly longer 
        - never negative
2. other person arrival time is unknown + random delays
    - fastest arrival is unknown, we assume uniform between 3 & 20 minutes
    - random delays: `rexp(n, .1)` 
3. maybe 5% chance you're both willing to strike up a meaningful conversation with a stranger at Starbucks on any given day
4. Q: Share some additional things we still **haven't** considered here!
- Note about modeling waiting time (i.e., delay)
    - Lots of "name-brand" probability distributions model waiting time scenarios
    - don't get stuck on the details of that choice here, just pay attention in STAT 414!
    - our choice: `ggplot() + geom_density(aes(x = rexp(n, .1)))`


```{r}
n <- 100000
sim_meetExp <- data.frame(
  you <- 5 + rexp(n, 0.1),                      # your arrival time (5 min + random delay)
  destiny <- runif(n, min = 3, max = 20) +      # fastest arrival is unknown between 3 & 20 min
             rexp(n, 0.1)) %>%                  # random delay
  mutate(opportunity = ifelse(abs(you - destiny) <= 10, 
                         "Same time, same place!", "So close, and yet so far..."), 
         talkativeMood = sample(c(TRUE, FALSE), size = n,    
                                replace = TRUE, prob = c(0.05, 0.95)),
         result = ifelse(test = (opportunity == "Same time, same place!" & talkativeMood == TRUE), 
                         yes = "Bliss", no = "Oh well..."))

tally(~ result, format = "percent", data = sim_meetExp)

destinyModel <- binom.test(~ result, n, success = "Bliss", data = sim_meetExp)
confint(destinyModel)
```

```{r}
sim_meetExp %>%
  ggplot() + 
  geom_point(aes(x = destiny, y = you, color = result), alpha = 0.3) + 
  ylab("You arrive (minutes after 10am)") + 
  xlab("Destiny arrives (minutes after 10am)") + 
  ggtitle("This is why it's so hard to meet people...") + 
  scale_x_continuous(limits = c(0, 60)) + 
  scale_y_continuous(limits = c(0, 60)) 
  
```

<!-- Day5; moved advising sim to Day6 -->


## Simulating Models

- It's often handy to directly simulate a statistical model of some kind in order to build intuition about it's properties
    - Simulate data that could have been produced by some model
    - Simulate sensitivity to model assumptions



## Simulating simple linear regression data

Consider a simple linear regression model:

\[y = \beta_0 + \beta_1*x\]

- such that
    - Y is some measurable response variable that we want to predict
    - X is some measurable explanatory variable used to predict Y
- again, we need to integrate some desired model with some "noise" due to randomness
    - What will be **fixed**?
    - What will be **random**?


## Simulating simple linear regression data

Consider a logistic regression model:

\[E[Y | X] = \beta_0 + \beta_1*X\]

\[y_i = b_0 + b_1*x_i + \epsilon_i\]

- Q: What's the difference between these representations?
- Q: Can you spot our inputs in the model summary?
- Q: What if we change the sample size?


```{r}
# beta_0 
intercept <- 26

# beta_1
beta <- -2.5

# sample size for our simulated model
n <- 5000

# X is a simulated random variable representing some measurable characteristic of the cases 
xtest <- rnorm(n, mean = 10, sd = 1)

# simulate errors: assume i.i.d. N(0, sigma)
errors <- rnorm(n, mean = 0, sd = 5)

# Y is a simulated random variable representing our response
ytest <- intercept + (xtest * beta) + errors

# fit the simulated logistic regression model
simreg <- lm(ytest ~ xtest)

# how did we do?
msummary(simreg)
# confint(simreg)

```





## Simulating logistic regression data

Consider a logistic regression model:

\[log(\frac{\pi}{1-\pi}) = \beta_0 + \beta_1*x\]

- such that, 
    - $\pi$ represents the population probability of "success" according to some binary outcome of interest to us
    - X is some measurable explanatory variable used to predict the odds of "success"

- again, we need to integrate some desired model with some "noise" due to randomness
    - What will be **fixed**?
    - What will be **random**?


## Simulating logistic regression data

Consider a logistic regression model:

\[log(\frac{\pi}{1-\pi}) = \beta_0 + \beta_1*x\]

\[log(\frac{p_i}{1-p_i}) = b_0 + b_1*x_i\]

- Q: What's the difference between these representations?
- Q: Why this step? `ifelse(runif(n) < prob, 1, 0)`


```{r}
# beta_0
intercept <- -1

# beta_1
beta <- 0.5

# sample size for our simulated model
n <- 5000

# X is a simulated random variable representing some measurable characteristic of the cases 
xtest <- rnorm(n, mean = 1, sd = 1)

# recall: the log-odds is a *linear* model with respect to our explanatory/predictor variables
linpred <- intercept + (xtest * beta)

# transform predictions to probabilities
prob <- exp(linpred)/(1 + exp(linpred))

# Y is a simulated random variable representing our response; the "errors" are built-in here
ytest <- ifelse(runif(n) < prob, 1, 0)

# fit the simulated logistic regression model
logreg <- glm(ytest ~ xtest, family=binomial)

# how did we do?
coef(logreg)
confint(logreg)

## interpret odds
exp(coef(logreg))
exp(confint(logreg))
```



## Evaluating Model Assumptions

- All models have assumptions of one sort or another
- When assumptions are violated, the conclusions of the model may be jeopardized
    - Not all violations are equally dangerous...
    - Q: **How can we tell which is which??**


## Evaluating Simple Linear Regression Assumptions

- Q: What are the (4) assumptions of linear regression?
- Q: How might we use simulation to "challenge" some of these assumptions and see how sensitive they are?


## Challenging the equal variance assumption

\[E[Y|X] = \beta_0 + \beta_1X_1 + \beta_2X_2\]

\[y_i = b_0 + b_1x_{1i} + b_2x_{2i} + \epsilon_i\]

- Q: How might we **challenge** the equal variance assumption?
- Q: What actually **goes wrong** when we have a problem of unequal variance?

```{r}
# sample size for our simulation
n <- 250

# sd of model errors
rmse <- 1

# two explanatory variables
x1 <- runif(n, min = 0, max = 15)

# model coefficients
beta0 <- -3
beta1 <- 0.5


# simulate response data
y <- beta0 + beta1*x1 + rnorm(n, mean = 0, sd = rmse)        # equal variance
# y <- beta0 + beta1*x1 + rnorm(n, mean = 0, sd = rmse + x1)   # UNequal variance

# plot data with smoother
data.frame(y, x1) %>%
  ggplot(aes(x = x1, y = y)) + 
  geom_point() +
  geom_smooth()

# regression model
simLM <- lm(y ~ x1)


# check the model fit
msummary(simLM)
mplot(simLM)[1]  # residual diagnostics


```




## Simulate many models

- Q: What do expect the distribution of parameter estimates to look like?
    - If equal variance assumption is satisfied
    - If equal variance assumption is NOT satisfied


```{r}
dosim <- function() {
  y <- beta0 + beta1*x1 + rnorm(n, mean = 0, sd = rmse + x1)   # departure
  mod <- lm(y ~ x1)
  result <- coefficients(mod)
  return(result)

}

sims <- mosaic::do(1000) * dosim()
favstats(~ x1, data = sims)

# distribution coefficient estimates
sims %>%
  ggplot(aes(x = x1)) +
  geom_density() +
  # pass arguments to `dnorm` function
  stat_function(fun = dnorm, args = list(mean = mean(sims$x1), sd = sd(sims$x1)),
                linetype = 2) +
  ggtitle("distribution of regression parameter") +
  scale_x_continuous("beta 1 coefficients")

```




## Simulating a complex system


```{r}
any_active <- function(df) {
  # return TRUE if someone has not finished
  return(max(df$endtime) == Inf)
}

next_customer <- function(df) {
  # returns the next customer in line
  res <- filter(df, endtime == Inf) %>%
    arrange(arrival)
  return(head(res, 1))
}

update_customer <- function(df, cust_num, end_time) {
  # sets the end time of a specific customer
  return(mutate(df, endtime = ifelse(custnum == cust_num, end_time, endtime)))
}
  
```


```{r}
run_sim <- function(n = 1/2, m = 3/2, hours = 6) {
  # simulation of bank where there is just one teller
  # n: expected number of customers per minute
  # m: expected length of transaction is m minutes
  # hours: bank open for this many hours
  
  customers <- rpois(hours * 60, lambda = n)
  arrival <- numeric(sum(customers))
  position <- 1
  for (i in 1:length(customers)) {
    numcust <- customers[i]
    if (numcust != 0) {
      arrival[position:(position + numcust - 1)] <- rep(i, numcust)
      position <- position + numcust
    }
  }
  duration <- rexp(length(arrival), rate = 1/m)  # E[X]=m
  df <- data.frame(arrival, duration, custnum = 1:length(duration), 
                   endtime = Inf, stringsAsFactors = FALSE)
  
  endtime <- 0 # set up beginning of simulation
  while (any_active(df)) { # anyone left to serve
    next_one <- next_customer(df)
    now <- ifelse(next_one$arrival >= endtime, next_one$arrival, endtime)
    endtime <- now + next_one$duration
    df <- update_customer(df, next_one$custnum, endtime)
  }
  df <- mutate(df, totaltime = endtime - arrival)
  return(favstats(~ totaltime, data = df))
}

```


```{r}
sim_results <- mosaic::do(3) * run_sim()
sim_results
```



